Introduction

The Vehicle Sales and Market Trends Dataset provides a comprehensive collection of information pertaining to the sales transactions of various vehicles.

The objective of this analysis is to explore sales distribution across different states.

The objectives are clear, the questions to abroad are the following:

Data Preprocessing

Meta data

A short preview of the data frame.

head(cars)
## # A tibble: 6 × 16
##    year make   model     trim  body  transmission vin   state condition odometer
##   <dbl> <chr>  <chr>     <chr> <chr> <chr>        <chr> <chr>     <dbl>    <dbl>
## 1  2015 Kia    Sorento   LX    SUV   automatic    5xyk… ca            5    16639
## 2  2015 Kia    Sorento   LX    SUV   automatic    5xyk… ca            5     9393
## 3  2014 BMW    3 Series  328i… Sedan automatic    wba3… ca           45     1331
## 4  2015 Volvo  S60       T5    Sedan automatic    yv16… ca           41    14282
## 5  2014 BMW    6 Series… 650i  Sedan automatic    wba6… ca           43     2641
## 6  2015 Nissan Altima    2.5 S Sedan automatic    1n4a… ca            1     5554
## # ℹ 6 more variables: color <chr>, interior <chr>, seller <chr>, mmr <dbl>,
## #   sellingprice <dbl>, saledate <chr>

Missing values, minimums, maximums, means and percentiles and more…

skim_without_charts(cars)
Data summary
Name cars
Number of rows 558837
Number of columns 16
_______________________
Column type frequency:
character 11
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
make 10301 0.98 2 13 0 96 0
model 10399 0.98 1 29 0 973 0
trim 10651 0.98 1 46 0 1963 0
body 13195 0.98 3 23 0 87 0
transmission 65352 0.88 5 9 0 4 0
vin 4 1.00 9 17 0 550297 0
state 0 1.00 2 17 0 64 0
color 749 1.00 1 9 0 46 0
interior 749 1.00 1 9 0 17 0
seller 0 1.00 3 50 0 14263 0
saledate 12 1.00 4 39 0 3766 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
year 0 1.00 2010.04 3.97 1982 2007 2012 2013 2015
condition 11820 0.98 30.67 13.40 1 23 35 42 49
odometer 94 1.00 68320.02 53398.54 1 28371 52254 99109 999999
mmr 38 1.00 13769.38 9679.97 25 7100 12250 18300 182000
sellingprice 12 1.00 13611.36 9749.50 1 6900 12100 18200 230000

Column names

colnames(cars)
##  [1] "year"         "make"         "model"        "trim"         "body"        
##  [6] "transmission" "vin"          "state"        "condition"    "odometer"    
## [11] "color"        "interior"     "seller"       "mmr"          "sellingprice"
## [16] "saledate"

Cleaning

Deleting observations that contains states with incorrect values, such as ‘3vwd17aj5fm297123’. And observations with empty values on the columns: seller, sellingprice and model.

cars2 <- cars %>% 
  filter(nchar(state) <= 2) %>%
  drop_na(saledate, seller, sellingprice, model)

nrow(cars) #Observations before
## [1] 558837
nrow(cars2) #Observations after
## [1] 548400

Sales Distribution Across States

Calculate the number of sold cars in each state.

table(cars2$state)
## 
##    ab    al    az    ca    co    fl    ga    hi    il    in    la    ma    md 
##   801    25  8575 71658  7661 81480 34056  1222 23188  4273  2164  6525 10841 
##    mi    mn    mo    ms    nc    ne    nj    nm    ns    nv    ny    oh    ok 
## 15287  9301 15798  1814 21333  3954 27331   164    46 12453  5609 21209    69 
##    on    or    pa    pr    qc    sc    tn    tx    ut    va    wa    wi 
##  3008  1136 53154  2673  1108  4146 20691 45195  1798 11694  7256  9704

Visualizing the distribution of sales per state.

ggplot(cars2, aes(x = reorder(state, table(state)[state]))) +
  geom_bar() +
  labs(title = "Number of Cars Sold by State", x = "State", y = "Number of Cars Sold")

Which states have the highest and lowest car sales?

Answer = The top 5 states with highest sales are: Fl, Ca, Pa, Tx and Ga. On the other hand, the top 5 states with lowest sales are: Al, Ns, Ok, Nm and Ab.

Average Selling Price by State

Find the average selling price for cars in each state.

avg_price_by_state <- aggregate(sellingprice ~ state, data = cars2, FUN = mean)
avg <- avg_price_by_state[order(avg_price_by_state$sellingprice, decreasing=TRUE),]
kable(avg, caption = "Average selling price by state")
Average selling price by state
state sellingprice
27 on 18232.231
33 tn 17069.398
29 pa 16038.392
5 co 15985.261
23 nv 15152.356
14 mi 14982.873
9 il 14852.803
16 mo 14557.157
4 ca 14540.312
25 oh 14380.163
37 wa 14077.437
38 wi 14054.818
31 qc 14003.339
6 fl 13976.670
15 mn 13789.779
20 nj 13626.459
34 tx 13296.173
7 ga 12990.106
19 ne 12681.879
8 hi 12296.154
35 ut 12227.997
1 ab 11793.009
28 or 11667.738
3 az 11627.498
24 ny 11479.330
17 ms 11364.407
10 in 10793.289
11 la 10592.970
32 sc 10429.096
22 ns 10041.304
30 pr 9528.077
2 al 9440.000
18 nc 8744.494
36 va 8684.673
13 md 7504.937
26 ok 6978.261
12 ma 6789.080
21 nm 6442.683

Compare prices across states.

#Setting the plot for the average selling price for cars in each state.
avg_plot <- ggplot(avg_price_by_state, aes(x = reorder(state, sellingprice), y = sellingprice, 
                                           text = paste("State: ", state, "<br>",
                                                        "Average Price: $", round(sellingprice, 2)))) +
  geom_bar(stat = "identity") +
  labs(title = "Average Selling Price by State", x = "State", y = "Average Selling Price") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

interactive_plot <- ggplotly(avg_plot, tooltip = "text")

# Disable zoom, pan, and other controls
interactive_plot <- interactive_plot %>%
  config(displayModeBar = FALSE,  
         scrollZoom = FALSE,     
         showTips = FALSE)   

# Display the customized interactive plot
interactive_plot

How does the average selling price differ by state?

Answer = The state with the highest average selling price is On with $18,232 and the state with the lowest average selling price is Nm with $6,442.

State-wise Analysis of Car Makes and Models

Identify the most popular car makes and models in each state.

# Select the top 6 states by total sales
top_states <- cars2 %>%
  group_by(state) %>%
  summarise(total_sales = n()) %>%
  top_n(6, total_sales) %>%
  pull(state)

# Select the top 10 makes by total sales
top_makes <- cars2 %>%
  group_by(make) %>%
  summarise(total_sales = n()) %>%
  top_n(10, total_sales) %>%
  pull(make)

# Filter the dataset to include only the top states and top makes
sales <- cars2 %>%
  filter(state %in% top_states, make %in% top_makes) %>%
  group_by(make, state) %>%
  summarise(counter = n()) %>%
  ungroup()

Visualizing the top 10 car makes and models for the top 6 states

# Plotting with vertical bars
sales %>%
  mutate(make_fact = make,
         make = reorder_within(make, counter, state)) %>%
  ggplot(aes(make, counter, fill = make_fact)) +  
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set3") +  
  scale_x_reordered() +  
  facet_wrap(~state, scales = "free_x") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

Heat-map limited to just the top 20 car makes and states, identify the top car makes and states based on the number of cars sold. Similar process as above.

# Calculate total sales by car make and state
total_make_sales <- cars2 %>%
  group_by(make) %>%
  summarise(total_sales = n()) %>%
  arrange(desc(total_sales)) %>%
  top_n(20, total_sales) %>%
  pull(make)

total_state_sales <- cars2 %>%
  group_by(state) %>%
  summarise(total_sales = n()) %>%
  arrange(desc(total_sales)) %>%
  top_n(20, total_sales) %>%
  pull(state)

# Filter the Data for Top 20 Car Makes and States
filtered_sales <- cars2 %>%
  filter(make %in% total_make_sales & state %in% total_state_sales) %>%
  group_by(state, make) %>%
  summarise(count = n()) %>%
  ungroup()

# Create the tooltip text
filtered_sales <- filtered_sales %>%
  mutate(text = paste0("State: ", state, "\n",
                       "Make: ", make, "\n",
                       "Cars Sold: ", count))

# Create the Heatmap
p <- ggplot(filtered_sales, aes(x = state, y = make, fill = count, text = text)) + 
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_ipsum() +
  labs(title = "Popularity of Top 20 Car Makes by Top 20 States",
       x = "State",
       y = "Car Make",
       fill = "Cars Sold")

# Adding interactivity with plotly
ggplotly(p, tooltip = "text")

Conclusion

The data suggests that car preferences can vary significantly across states. To better understand what influences buyers, a deeper investigation is required. Variables such as price, year, odometer, and color should be considered.

Extra findings

Some trends and relations have been found during the analysis.

Transmission: Automatic transmissions are increasingly popular in all states, representing 96% of all car sales.

# Select the top 12 states by total sales
top_states <- cars2 %>%
  group_by(state) %>%
  summarise(total_sales = n()) %>%
  top_n(12, total_sales) %>%
  pull(state)

# Droping N/A transmission values
cars_f <- cars2 %>%
  filter(state %in% top_states) %>%
  drop_na(transmission)

# Plotting
ggplot(data = cars_f) +
  geom_bar(mapping = aes(x = transmission, fill = transmission)) +  
  facet_wrap(~state) +
  labs(title = "Car Sales by Transmission Type in Top 12 States",
       x = "Transmission Type",
       y = "Number of Cars Sold") 

ggplot(data = cars_f) +
  geom_bar(mapping = aes(y = transmission, fill = transmission)) +  
  labs(title = "Total car sales by transmission",
       x = "Transmission Type",
       y = "Number of Cars Sold") +
      scale_x_continuous(labels = comma)

# Calculate the percentage of automatic transmission sales
auto_percentage <- mean(cars_f$transmission == "automatic") * 100
(auto_percentage)
## [1] 96.6728

Price - Sells: The majority of vehicles sold are priced between $1,701 and $12,00, with a 44% of the total sales.

# Calculate the percentages for each price range
quartiles2 <- cut(cars2$sellingprice,
                 breaks = c(1, 1700, 12200, 18300, 230000),
                 labels = c("$1,700", "$12,200", "$18,300", "$230,000"),
                 include.lowest = TRUE)

cars_prices <- cars2 %>%
  mutate(price_quartile = quartiles2)

price_distribution <- cars_prices %>%
  group_by(price_quartile) %>%
  summarise(count = n()) %>%
  mutate(percentage = round((count / sum(count)) * 100, 1))

# Create the plot
ggplot(price_distribution, aes(x = 1, y = percentage, fill = price_quartile)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = paste0(percentage, "%")), position = position_stack(vjust = 0.5), color = "white") +
  labs(title = "Distribution of Car Sales by Price Range",
       x = "Range of car price",
       y = "Percentage of Total Sales") +
  scale_fill_brewer(palette = "Set2") 

# Calculate the percentage of sales within the price range $1,701 to $12,200, to verify.
price_percentage <- mean(cars2$sellingprice >= 1701 & cars2$sellingprice <= 12200) * 100
price_percentage
## [1] 44.43508